# merge_all: link one cohort's high-school admission (ADM) records to its
# baccalaureate (BAC) records.
#
# Pipeline (each stage only sees records the previous stages left unmatched):
#   0. Clean both inputs and coerce grade columns to numeric.
#   1. Build a school crosswalk (`licee_match`) between the BAC school name
#      (`unitate_de_invatamant`) and the ADM school name (`liceu_repartizat`):
#      identical names first, then the pairing shared by most fuzzily matched
#      students.
#   2. Fuzzy-match students by name inside each crosswalked school.
#   3. Exact-name match across schools, same county.
#   4. Exact-name match across schools, any county.
#   5. Fuzzy-name match across schools, same county.
#   6. Fuzzy-name match across schools, other counties.
#   7. Append the remaining unmatched BAC and ADM records and derive school-,
#      programme- and language-level statistics.
#
# Arguments:
#   data_adm_raw  data frame of admission records. Columns referenced by name
#                 include nume, id, judet_adm, liceu_repartizat,
#                 scoala_de_provenienta, media_la_admitere, nota_optiunea_3
#                 and nota_lb_materna.
#   data_bac_raw  data frame of baccalaureate records. Columns referenced by
#                 name include nume, id, judet_bac, unitate_de_invatamant,
#                 promotie_anterioara, pozitia_pe_tara, specializare, media
#                 and the `*_final` grade columns.
#   year          value written to the `an` column of the output.
#
# Returns an unnamed list of length two:
#   [[1]] every record (matched pairs, earlier-cohort BAC records, unmatched
#         BAC records, unmatched ADM records) with the derived statistics;
#   [[2]] the subset of [[1]] that has a name distance below 6.
#
# The function relies on dplyr verbs, on `stringdist_left_join()` /
# `stringdist_join()` (presumably from fuzzyjoin) and on stringr's
# `str_extract()` / `str_extract_all()` being attached by the caller.
# It prints progress and match rates as a side effect.
merge_all <- function(data_adm_raw, data_bac_raw, year) {
  ############ 0. Clean inputs #################

  # ADM data: grades may arrive as factor/character.
  data_adm_raw$nota_optiunea_3 <- as.numeric(as.character(data_adm_raw$nota_optiunea_3))
  data_adm_raw$nota_lb_materna <- as.numeric(as.character(data_adm_raw$nota_lb_materna))

  # BAC data: drop columns that are not used downstream.
  dropped_bac_cols <- c(
    "url", "lb_romana_scris", "lb_romana_contestatie", "lb_materna",
    "lb_materna_scris", "lb_materna_oral", "lb_materna_contestatie",
    "disciplina_obligatorie_scris_nota",
    "disciplina_obligatorie_scris_contestatie",
    "disciplina_alegere_aria_culiculara_nota",
    "disciplina_alegere_aria_culiculara_contestatie",
    "disciplina_alegere_celelalte_arii_culiculare_nota",
    "disciplina_alegere_celelalte_arii_culiculare_contestatie"
  )
  data_bac_raw <- data_bac_raw[, !(names(data_bac_raw) %in% dropped_bac_cols)]
  # "-" and "" both stand for a missing value.
  data_bac_raw[data_bac_raw == "-"] <- NA
  data_bac_raw[data_bac_raw == ""] <- NA
  data_bac_raw$promotie_anterioara <- trimws(data_bac_raw$promotie_anterioara)

  # Grades use a decimal comma; convert to a decimal point before as.numeric().
  cols.num <- c(
    "lb_romana_final",
    "lb_materna_final",
    "disciplina_obligatorie_scris_final",
    "disciplina_alegere_aria_culiculara_final",
    "disciplina_alegere_celelalte_arii_culiculare_final",
    "media"
  )
  for (x in cols.num) {
    data_bac_raw[[x]] <- as.numeric(
      gsub(",", ".", as.character(data_bac_raw[[x]]), fixed = TRUE)
    )
  }

  # Remove students with censored or missing names.
  data_bac_raw <- data_bac_raw[!is.na(data_bac_raw$nume), ]
  data_adm_raw <- data_adm_raw[!is.na(data_adm_raw$nume), ]

  ############ 1. School crosswalk #################

  # 1a. Schools whose name is identical in both sources (within a county).
  licee_match_perfect <- semi_join(
    unique(data_bac_raw[, c("judet_bac", "unitate_de_invatamant")]),
    unique(data_adm_raw[, c("judet_adm", "liceu_repartizat")]),
    by = c("unitate_de_invatamant" = "liceu_repartizat", "judet_bac" = "judet_adm")
  )
  licee_match_perfect$liceu_repartizat <- licee_match_perfect$unitate_de_invatamant
  if (nrow(licee_match_perfect) > 0) {
    # Sentinel count/shares so that perfect matches pass every threshold below.
    licee_match_perfect$n <- 999
    licee_match_perfect$perc_liceu <- 1
    licee_match_perfect$perc_unitate <- 1
  }

  # 1b. Counties that still contain schools without a perfect match.
  list_judet <- anti_join(data_bac_raw,
                          licee_match_perfect,
                          by = c("unitate_de_invatamant", "judet_bac" = "judet_bac")) %>%
    select(unitate_de_invatamant, judet_bac) %>%
    group_by(unitate_de_invatamant, judet_bac) %>%
    summarize() %>%
    ungroup()
  list_judet <- unique(list_judet[, c("judet_bac")])$judet_bac

  # Fuzzy-join students county by county, in chunks of `length_subsample` BAC
  # rows to bound the size of each string-distance computation.
  length_subsample <- 5000
  join_liceu <- lapply(seq_along(list_judet), function(x) {
    print(list_judet[x])
    bac_filtered <- data_bac_raw[which(data_bac_raw$judet_bac == list_judet[x] &
                                         data_bac_raw$promotie_anterioara == "NU"), ]
    bac_filtered <- anti_join(bac_filtered, licee_match_perfect,
                              by = c("unitate_de_invatamant" = "unitate_de_invatamant",
                                     "judet_bac" = "judet_bac"))
    # Nothing to match in this county (e.g. only earlier-cohort students).
    if (nrow(bac_filtered) == 0) {
      return(NULL)
    }

    n_subsamples <- ceiling(nrow(bac_filtered) / length_subsample)
    bac_chunks <- split(
      bac_filtered,
      rep(seq_len(n_subsamples), length.out = nrow(bac_filtered), each = length_subsample)
    )

    adm_filtered <- data_adm_raw[which(data_adm_raw$judet_adm == list_judet[x]), ]
    adm_filtered <- anti_join(adm_filtered, licee_match_perfect,
                              by = c("liceu_repartizat" = "unitate_de_invatamant",
                                     "judet_adm" = "judet_bac"))

    chunk_joins <- lapply(seq_len(n_subsamples), function(y) {
      joined <- stringdist_left_join(bac_chunks[[y]], adm_filtered,
                                     by = c("nume" = "nume"),
                                     max_dist = 3,
                                     distance_col = "dist")
      # The distance column can be absent when the join finds nothing; add it
      # per chunk so all chunks have the same columns before rbind().
      if (!"dist" %in% colnames(joined)) {
        joined$dist <- rep(100, nrow(joined))
      }
      joined
    })
    county_join <- do.call("rbind", chunk_joins)

    # 100 marks "no ADM candidate found".
    county_join$dist[is.na(county_join$dist)] <- 100

    # Keep the closest ADM candidate for every BAC record.
    county_join %>%
      dplyr::group_by(id.x) %>%
      slice(which.min(dist)) %>%
      ungroup()
  })

  print("first join done")

  join_liceu <- do.call("rbind", join_liceu)
  # The dot must be escaped: an unescaped ".x" would rewrite any character
  # followed by "x", not only the ".x" join suffix.
  colnames(join_liceu) <- gsub("\\.x", "_bac", colnames(join_liceu))
  colnames(join_liceu) <- gsub("\\.y", "_adm", colnames(join_liceu))

  join_liceu <- join_liceu %>%
    dplyr::group_by(judet_bac, unitate_de_invatamant, liceu_repartizat, nume_bac) %>%
    slice(which.min(dist))

  join_liceu <- join_liceu %>%
    filter(!is.na(liceu_repartizat) & !is.na(unitate_de_invatamant) & liceu_repartizat != '')

  # n            = students shared by a (BAC school, ADM school) pair
  # perc_liceu   = share of the ADM school's matched students in this pair
  # perc_unitate = share of the BAC school's matched students in this pair
  join_liceu <- join_liceu %>%
    dplyr::group_by(judet_bac, unitate_de_invatamant, liceu_repartizat) %>%
    summarize(n = n()) %>%
    dplyr::group_by(judet_bac, liceu_repartizat) %>%
    mutate(perc_liceu = n / sum(n)) %>%
    dplyr::group_by(judet_bac, unitate_de_invatamant) %>%
    mutate(perc_unitate = n / sum(n))

  # Best ADM school for each BAC school, then best BAC school for each ADM
  # school, so that the crosswalk is one-to-one.
  licee_match <- join_liceu %>%
    group_by(judet_bac, unitate_de_invatamant) %>%
    slice(which.max(perc_unitate))
  licee_match <- licee_match %>%
    group_by(judet_bac, liceu_repartizat) %>%
    slice(which.max(perc_liceu)) %>%
    ungroup()

  licee_match <- rbind(licee_match, licee_match_perfect)

  # Second chance for schools that lost out in the one-to-one selection.
  join_liceu_unmatched_unitate <- anti_join(join_liceu, licee_match,
                                            by = c("judet_bac", "unitate_de_invatamant"))
  join_liceu_unmatched_liceu <- anti_join(join_liceu, licee_match,
                                          by = c("judet_bac", "liceu_repartizat"))

  licee_match_unmatched_unitate <- join_liceu_unmatched_unitate %>%
    group_by(judet_bac, unitate_de_invatamant) %>%
    slice(which.max(perc_unitate)) %>%
    ungroup() %>%
    filter(n > 5,
           (perc_liceu > 0.8 | perc_unitate > 0.8) & perc_liceu > 0.1 & perc_unitate > 0.1)
  licee_match_unmatched_liceu <- join_liceu_unmatched_liceu %>%
    group_by(judet_bac, liceu_repartizat) %>%
    slice(which.max(perc_liceu)) %>%
    ungroup() %>%
    filter(n > 5,
           (perc_liceu > 0.8 | perc_unitate > 0.8) & perc_liceu > 0.1 & perc_unitate > 0.1)

  # NOTE(review): `&` binds tighter than `|`, so the 0.1 floors apply only to
  # the second alternative. The parentheses below spell out that existing
  # evaluation order; confirm that it is the intended rule.
  licee_match <- rbind(licee_match, licee_match_unmatched_unitate, licee_match_unmatched_liceu) %>%
    filter((n > 10 & (perc_liceu >= 0.5 & perc_unitate >= 0.5)) |
             ((n > 1 & (perc_liceu >= 0.8 | perc_unitate >= 0.8)) &
                perc_liceu > 0.1 & perc_unitate > 0.1)) %>%
    select(-perc_liceu, -perc_unitate, -n)

  licee_match <- na.omit(licee_match) %>% arrange(judet_bac, unitate_de_invatamant)
  licee_match <- unique(licee_match)

  # Attach the ADM school name to the BAC data (county by county).
  bac_with_hs <- lapply(unique(data_bac_raw$judet_bac), function(x) {
    base::merge(data_bac_raw[data_bac_raw$judet_bac == x, ],
                licee_match[licee_match$judet_bac == x, ],
                by.x = c("unitate_de_invatamant", "judet_bac"),
                by.y = c("unitate_de_invatamant", "judet_bac"),
                suffixes = c("_bac", ".prog"), all.x = TRUE)
  })
  bac_with_hs <- do.call("rbind", bac_with_hs)
  bac_with_hs <- bac_with_hs[, !colnames(bac_with_hs) %in% c("specializare_adm", "n")]

  # Attach the BAC school name to the ADM data (county by county).
  data_adm_raw <- lapply(unique(data_adm_raw$judet_adm), function(x) {
    base::merge(data_adm_raw[data_adm_raw$judet_adm == x, ],
                licee_match[licee_match$judet_bac == x, ],
                by.x = c("liceu_repartizat", "judet_adm"),
                by.y = c("liceu_repartizat", "judet_bac"),
                suffixes = c("_bac", ".prog"), all.x = TRUE)
  })
  data_adm_raw <- do.call("rbind", data_adm_raw)
  data_adm_raw <- data_adm_raw[, !colnames(data_adm_raw) %in% c("specializare_adm", "n")]

  ############ 2. Match students within each crosswalked school #################

  data_merged_complete_matched <- lapply(seq_len(nrow(licee_match)), function(x) {
    left <- bac_with_hs %>%
      filter(judet_bac == licee_match$judet_bac[x] &
               liceu_repartizat == licee_match$liceu_repartizat[x] &
               promotie_anterioara == 'NU')
    right <- data_adm_raw %>%
      filter(judet_adm == licee_match$judet_bac[x] &
               liceu_repartizat == licee_match$liceu_repartizat[x])
    result <- stringdist_left_join(left, right,
                                   by = c("nume"), max_dist = 6, distance_col = "dist")
    if (!"dist" %in% colnames(result)) {
      # rep() keeps this valid for a zero-row result as well.
      result$dist <- rep(100, nrow(result))
    }
    result
  })
  data_merged_complete_matched <- do.call("rbind", data_merged_complete_matched)

  # Here the BAC table was the left side of the join: .x = BAC, .y = ADM.
  colnames(data_merged_complete_matched) <- gsub("\\.x", "_bac", colnames(data_merged_complete_matched))
  colnames(data_merged_complete_matched) <- gsub("\\.y", "_adm", colnames(data_merged_complete_matched))
  colnames(data_merged_complete_matched)[colnames(data_merged_complete_matched) == 'liceu_repartizat_adm'] <- 'liceu_repartizat'
  colnames(data_merged_complete_matched)[colnames(data_merged_complete_matched) == 'unitate_de_invatamant_bac'] <- 'unitate_de_invatamant'

  data_merged_complete_matched <- data_merged_complete_matched %>%
    rename(liceu_repartizat_corresponding_to_unitate_de_invatamant = liceu_repartizat_bac)

  data_merged_complete_matched$dist[is.na(data_merged_complete_matched$dist)] <- 100

  # One ADM candidate per BAC record, then one BAC record per ADM record.
  # After the second slice every ADM key is unique, so no further
  # de-duplication is needed once the rows without an ADM match are dropped.
  data_merged_complete_matched <- data_merged_complete_matched %>%
    group_by(judet_bac, unitate_de_invatamant, pozitia_pe_tara, nume_bac) %>%
    slice(which.min(dist))
  data_merged_complete_matched <- data_merged_complete_matched %>%
    group_by(judet_adm, liceu_repartizat, nume_adm, media_la_admitere) %>%
    slice(which.min(dist)) %>%
    ungroup() %>%
    filter(!is.na(nume_adm)) %>%
    mutate(school_change = FALSE)

  # BAC candidates from an earlier cohort are carried along without an ADM
  # match so that they are not reported as unmatched.
  bac_unmatched_different_cohort <- bac_with_hs %>%
    filter(promotie_anterioara == 'DA') %>%
    rename(nume_bac = nume, specializare_bac = specializare, id_bac = id)
  data_merged_complete_matched <- bind_rows(bac_unmatched_different_cohort,
                                            data_merged_complete_matched)

  # ---- helpers shared by the cross-school matching rounds ----

  # Suffix every column of `tbl` that is not already a column of `matched`.
  suffix_new_columns <- function(tbl, matched, suffix) {
    is_new <- !colnames(tbl) %in% colnames(matched)
    colnames(tbl)[is_new] <- paste(colnames(tbl)[is_new], suffix, sep = "")
    tbl
  }

  # BAC and ADM records that do not appear in `matched`, with their
  # source-specific columns suffixed "_bac" / "_adm".
  find_unmatched <- function(matched) {
    bac <- anti_join(bac_with_hs, matched,
                     by = c("nume" = "nume_bac", "judet_bac" = "judet_bac",
                            "pozitia_pe_tara" = "pozitia_pe_tara"))
    adm <- anti_join(data_adm_raw, matched,
                     by = c("nume" = "nume_adm", "judet_adm" = "judet_adm",
                            "liceu_repartizat" = "liceu_repartizat",
                            "scoala_de_provenienta" = "scoala_de_provenienta",
                            "media_la_admitere" = "media_la_admitere"))
    list(bac = suffix_new_columns(bac, matched, "_bac"),
         adm = suffix_new_columns(adm, matched, "_adm"))
  }

  # Harmonise the column names of a cross-school candidate table. In these
  # rounds the ADM table is the left side of the join: .x = ADM, .y = BAC.
  label_cross_school <- function(candidates) {
    colnames(candidates) <- gsub("\\.x", "_adm", colnames(candidates))
    colnames(candidates) <- gsub("\\.y", "_bac", colnames(candidates))
    colnames(candidates)[colnames(candidates) == 'liceu_repartizat_adm'] <- 'liceu_repartizat'
    colnames(candidates)[colnames(candidates) == 'unitate_de_invatamant_bac'] <- 'unitate_de_invatamant'
    candidates %>%
      rename(liceu_repartizat_corresponding_to_unitate_de_invatamant = liceu_repartizat_bac) %>%
      ungroup()
  }

  # Append accepted pairs to `matched` and recompute who is still unmatched.
  extend_matched <- function(matched, accepted) {
    matched <- bind_rows(matched, accepted)
    c(find_unmatched(matched), list(matched = matched))
  }

  # Exact-name round: among duplicates keep the pair whose BAC average is
  # closest to the admission average; the name distance is 0 by construction.
  clean_result <- function(candidates, matched) {
    candidates <- candidates %>%
      group_by(judet_bac, nume_bac, media) %>%
      arrange(abs(media - media_la_admitere)) %>%
      slice(1)
    candidates <- candidates %>%
      group_by(judet_adm, nume_adm, media_la_admitere) %>%
      arrange(abs(media - media_la_admitere)) %>%
      slice(1)
    accepted <- label_cross_school(candidates) %>%
      mutate(school_change = TRUE, dist = 0)
    extend_matched(matched, accepted)
  }

  # Initials of a full name: first letter of every space- or hyphen-separated
  # part (ASCII letters only).
  name_initials <- function(full_name) {
    parts <- str_extract_all(full_name, "^[a-zA-Z]|\\s[a-zA-Z]|-[a-zA-Z]")
    gsub("\\s|-", "", vapply(parts, paste, character(1), collapse = ""))
  }

  # Fuzzy-name round: accept a pair when the initials agree or the distance is
  # at most 2, then keep the closest pair among duplicates.
  clean_result_partial <- function(candidates, matched) {
    # No candidate pairs at all (the join may not even have a `dist` column).
    if (is.null(candidates) || nrow(candidates) == 0 ||
        !"dist" %in% colnames(candidates)) {
      return(c(find_unmatched(matched), list(matched = matched)))
    }
    candidates <- candidates %>%
      filter(name_initials(nume_adm) == name_initials(nume_bac) | dist <= 2)
    candidates <- candidates %>%
      group_by(judet_bac, nume_bac, media) %>%
      arrange(dist) %>%
      slice(1)
    candidates <- candidates %>%
      group_by(judet_adm, nume_adm, media_la_admitere) %>%
      arrange(dist) %>%
      slice(1)
    accepted <- label_cross_school(candidates) %>%
      mutate(school_change = TRUE)
    extend_matched(matched, accepted)
  }

  remaining <- find_unmatched(data_merged_complete_matched)
  bac_unmatched <- remaining$bac
  adm_unmatched <- remaining$adm

  ############ 3. Different school, same county, exact name #################
  data_matched_diff_school <- base::merge(adm_unmatched, bac_unmatched,
                                          by.x = c("nume_adm", "judet_adm"),
                                          by.y = c("nume_bac", "judet_bac")) %>%
    mutate(nume_bac = nume_adm, judet_bac = judet_adm)

  result <- clean_result(data_matched_diff_school, data_merged_complete_matched)
  bac_unmatched <- result$bac
  adm_unmatched <- result$adm
  data_merged_complete_matched <- result$matched

  ############ 4. Different school, any county, exact name #################
  data_matched_diff_school <- base::merge(adm_unmatched, bac_unmatched,
                                          by.x = c("nume_adm"),
                                          by.y = c("nume_bac")) %>%
    mutate(nume_bac = nume_adm)

  result <- clean_result(data_matched_diff_school, data_merged_complete_matched)
  bac_unmatched <- result$bac
  adm_unmatched <- result$adm
  data_merged_complete_matched <- result$matched

  ############ 5. Different school, same county, similar name #################
  data_matched_diff_school <- lapply(unique(bac_unmatched$judet_bac), function(county) {
    left <- adm_unmatched %>% filter(judet_adm == county)
    right <- bac_unmatched %>% filter(judet_bac == county)
    stringdist_join(left, right,
                    by = c("nume_adm" = "nume_bac"),
                    distance_col = "dist",
                    max_dist = 3)
  })
  data_matched_diff_school <- do.call("rbind", data_matched_diff_school)

  result <- clean_result_partial(data_matched_diff_school, data_merged_complete_matched)
  bac_unmatched <- result$bac
  adm_unmatched <- result$adm
  data_merged_complete_matched <- result$matched

  ############ 6. Different school, other county, similar name #################
  # For speed, loop by county and by first letter of the name, which minimises
  # the number of string distances computed. The first letters are computed
  # once here rather than inside every (county, letter) iteration.
  adm_first_letter <- str_extract(adm_unmatched$nume_adm, "^[a-zA-Z]")
  bac_first_letter <- str_extract(bac_unmatched$nume_bac, "^[a-zA-Z]")
  first_letters <- unique(bac_first_letter)
  # Names that do not start with an ASCII letter yield NA and can never satisfy
  # the `== letter` filters below, so skip them.
  first_letters <- first_letters[!is.na(first_letters)]

  data_matched_diff_school <- lapply(unique(bac_unmatched$judet_bac), function(county) {
    per_letter <- lapply(first_letters, function(letter) {
      left <- adm_unmatched %>%
        filter(judet_adm != county & adm_first_letter == letter)
      right <- bac_unmatched %>%
        filter(judet_bac == county & bac_first_letter == letter)
      stringdist_join(left, right,
                      by = c("nume_adm" = "nume_bac"),
                      distance_col = "dist",
                      max_dist = 3)
    })
    do.call("rbind", per_letter)
  })
  data_matched_diff_school <- do.call("rbind", data_matched_diff_school)

  result <- clean_result_partial(data_matched_diff_school, data_merged_complete_matched)
  bac_unmatched <- result$bac
  adm_unmatched <- result$adm
  data_merged_complete_matched <- result$matched

  ############ 7. Assemble output and derive statistics #################

  # Add unmatched BAC and ADM records to the matched data.
  data_merged_complete <- bind_rows(data_merged_complete_matched, bac_unmatched)
  data_merged_complete <- bind_rows(data_merged_complete, adm_unmatched) %>%
    arrange(unitate_de_invatamant, nume_bac, nume_adm)

  # Rows that only have an ADM record borrow county and name from it.
  # `x$col[mask] <- ...` is used because it is also valid when no row matches.
  no_county <- is.na(data_merged_complete$judet_bac)
  data_merged_complete$judet_bac[no_county] <- data_merged_complete$judet_adm[no_county]
  no_name <- is.na(data_merged_complete$nume_bac)
  data_merged_complete$nume_bac[no_name] <- data_merged_complete$nume_adm[no_name]
  data_merged_complete$an <- year

  # ADM students whose school has no BAC counterpart get an explicit marker.
  no_school <- is.na(data_merged_complete$unitate_de_invatamant)
  data_merged_complete$unitate_de_invatamant[no_school] <- 'HS NOT MATCHED'

  # na.rm = TRUE: a missing cohort flag / school name must not turn the whole
  # rate into NA.
  match_rate <- 1 - nrow(bac_unmatched) /
    sum(data_bac_raw$promotie_anterioara == 'NU', na.rm = TRUE)
  print("match rate bac")
  print(match_rate)
  match_rate <- 1 - nrow(adm_unmatched) /
    sum(data_adm_raw$liceu_repartizat != '', na.rm = TRUE)
  print("match rate adm")
  print(match_rate)

  # School and school-programme identifiers (unique within this call only).
  # NOTE(review): group_indices() without arguments is deprecated in
  # dplyr >= 1.0 in favour of cur_group_id(); kept as-is so the function does
  # not require a newer dplyr than the rest of the file.
  data_merged_complete$adm_exam_ave <- NA
  data_merged_complete <- data_merged_complete %>%
    group_by(unitate_de_invatamant, judet_bac) %>%
    mutate(HS_ID_year = group_indices())
  data_merged_complete <- data_merged_complete %>%
    group_by(unitate_de_invatamant, judet_bac, specializare_bac) %>%
    mutate(HS_prog_ID_year = group_indices())

  data_merged_complete$nota_lb_materna <- as.numeric(data_merged_complete$nota_lb_materna)
  data_merged_complete$nota_matematica <- as.numeric(data_merged_complete$nota_matematica)
  data_merged_complete$nota_lb_romana <- as.numeric(data_merged_complete$nota_lb_romana)

  data_merged_complete <- as.data.frame(data_merged_complete)

  # Admission exam average: three components when there is no mother-tongue
  # grade, four otherwise. After as.numeric() above, "no grade" is simply NA.
  no_mother_tongue <- is.na(data_merged_complete$nota_lb_materna)
  data_merged_complete$adm_exam_ave[no_mother_tongue] <- rowMeans(
    data_merged_complete[no_mother_tongue,
                         c("media_en_tsu", "nota_matematica", "nota_lb_romana"),
                         drop = FALSE]
  )
  data_merged_complete$adm_exam_ave[!no_mother_tongue] <- rowMeans(
    data_merged_complete[!no_mother_tongue,
                         c("nota_lb_materna", "media_en_tsu", "nota_matematica", "nota_lb_romana"),
                         drop = FALSE]
  )
  data_merged_complete$adm_diff <- data_merged_complete$adm_exam_ave -
    data_merged_complete$media_de_absolvire

  # Middle-school level averages (NA if any student of the school has NA).
  data_merged_complete <- data_merged_complete %>%
    group_by(scoala_de_provenienta) %>%
    mutate(adm_exam_ave_sc = mean(adm_exam_ave),
           adm_four_year_ave_sc = mean(media_de_absolvire))
  data_merged_complete$adm_diff_school <- data_merged_complete$adm_exam_ave_sc -
    data_merged_complete$adm_four_year_ave_sc

  # High-school level stats.
  data_merged_complete <- data_merged_complete %>%
    group_by(HS_ID_year) %>%
    mutate(threshold_school = min(media_la_admitere, na.rm = TRUE),
           n_students_hs = n(),
           n_progr_hs = n_distinct(HS_prog_ID_year),
           n_ms_feeding_hs = n_distinct(scoala_de_provenienta))
  # Programme level stats.
  data_merged_complete <- data_merged_complete %>%
    group_by(HS_prog_ID_year) %>%
    mutate(threshold_progr = min(media_la_admitere, na.rm = TRUE),
           n_students_progr = n())

  # Language stream = main language plus, potentially, an intensively taught
  # foreign language; everything from the first "(" onwards is dropped.
  data_merged_complete$language_stream <- gsub("\\(.*", "", data_merged_complete$specializare_lb)
  # Main language of instruction only (also drops everything after "/").
  data_merged_complete$language_main <- gsub("\\(.*|/.*", "", data_merged_complete$specializare_lb)
  # Second language, decoded from the three-letter tag in specializare_lb.
  data_merged_complete$language_second <-
    ifelse(grepl("LEN", data_merged_complete$specializare_lb), "Engleza",
    ifelse(grepl("LFR", data_merged_complete$specializare_lb), "Franceza",
    ifelse(grepl("LGE", data_merged_complete$specializare_lb), "Germana",
    ifelse(grepl("LIT", data_merged_complete$specializare_lb), "Italiana",
    ifelse(grepl("LPO", data_merged_complete$specializare_lb), "Portugheza",
    ifelse(grepl("LSP", data_merged_complete$specializare_lb), "Spaniola", NA))))))

  # Programme-language identifier and stats.
  data_merged_complete <- data_merged_complete %>%
    group_by(unitate_de_invatamant, judet_bac, specializare_bac, specializare_lb) %>%
    mutate(HS_prog_lang_ID_year = group_indices())
  data_merged_complete <- data_merged_complete %>%
    group_by(HS_prog_lang_ID_year) %>%
    mutate(threshold_progr_lang = min(media_la_admitere, na.rm = TRUE),
           n_students_progr_lang = n()) %>%
    ungroup()

  # Keep only matched students. Unmatched and earlier-cohort rows carry
  # dist = NA; indexing with an NA-containing condition would insert all-NA
  # rows, so the mask is made NA-safe.
  is_matched <- !is.na(data_merged_complete$dist) &
    data_merged_complete$dist != 100 &
    data_merged_complete$dist < 6
  data_merged_filtered <- data_merged_complete[is_matched, ]

  list(data_merged_complete, data_merged_filtered)
}


# for (i in 1:3233){
#   if(dim(data_merged_complete_matched[[i]])[2]<45){print(i)}}
# test<-data_merged_complete_matched[[2838]]